#  File src/library/tools/R/urltools.R
#  Part of the R package, https://www.R-project.org
#
#  Copyright (C) 2015-2019 The R Core Team
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  A copy of the GNU General Public License is available at
#  https://www.R-project.org/Licenses/

get_IANA_URI_scheme_db <-
function()
{
    ## Download the IANA URI scheme registry as a data frame; see
    ## <https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml>.
    baseurl <- "https://www.iana.org/assignments/uri-schemes/"
    con <- url(paste0(baseurl, "uri-schemes-1.csv"))
    db <- utils::read.csv(con, stringsAsFactors = FALSE,
                          encoding = "UTF-8")
    ## Use '_' instead of '.' in the variable names.
    names(db) <- chartr(".", "_", names(db))
    db
}

parse_URI_reference <-
function(x)
{
    ## Split URI references into their five components, using the
    ## regexp from RFC_3986 <http://www.ietf.org/rfc/rfc3986.txt>,
    ## Appendix B.  Returns a character matrix with one row per
    ## element of x.
    re <- "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"
    parts <- if(length(x)) {
        m <- do.call(rbind, regmatches(x, regexec(re, x)))
        ## Keep only the capture groups for the five components
        ## (dropping the full match and the delimiter groups).
        m[, c(3, 5, 6, 8, 10), drop = FALSE]
    } else
        matrix(character(), 0L, 5L)
    colnames(parts) <- c("scheme", "authority", "path", "query", "fragment")
    parts
}

.get_urls_from_Rd <-
function(x)
{
    ## Walk the Rd parse tree collecting the targets of \url markup
    ## and the first argument of \href markup.
    urls <- character()
    recurse <- function(e) {
        tag <- attr(e, "Rd_tag")
        ## Rd2HTML and Rd2latex remove whitespace and \n from URLs.
        if(identical(tag, "\\url"))
            urls <<- c(urls, lines2str(.Rd_deparse(e, tag = FALSE)))
        else if(identical(tag, "\\href"))
            urls <<- c(urls, lines2str(.Rd_deparse(e[[1L]], tag = FALSE)))
        else if(is.list(e))
            for(el in e) recurse(el)
    }
    for(el in x) recurse(el)
    unique(trimws(urls))
}

.get_urls_from_HTML_file <-
function(f)
{
    ## Extract the href targets of all <a> elements in an HTML file,
    ## dropping missing values and within-document ("#...") links.
    doc <- xml2::read_html(f)
    if(!inherits(doc, "xml_node")) return(character())
    hrefs <- xml2::xml_attr(xml2::xml_find_all(doc, "//a"), "href")
    keep <- !is.na(hrefs) & !startsWith(hrefs, "#")
    unique(hrefs[keep])
}

.get_urls_from_PDF_file <-
function(f)
{
    ## Seems there is no straightforward way to extract hyperrefs from a
    ## PDF, hence first convert to HTML.
    ## Note that pdftohtml always outputs in cwd ...
    owd <- getwd()
    dir.create(d <- tempfile())
    ## Restore the working directory *before* removing the scratch
    ## directory: unlinking the current working directory can fail
    ## (notably on Windows).
    on.exit({ setwd(owd); unlink(d, recursive = TRUE) })
    file.copy(normalizePath(f), d)
    setwd(d)
    g <- tempfile(tmpdir = d, fileext = ".xml")
    system2("pdftohtml",
            c("-s -q -i -c -xml", shQuote(basename(f)), shQuote(basename(g))))
    ## Oh dear: seems that pdftohtml can fail without a non-zero exit
    ## status, so check for the output file instead of the return value.
    if(file.exists(g))
        .get_urls_from_HTML_file(g)
    else
        character()
}

url_db <-
function(urls, parents)
{
    ## Construct a "url_db": a two-column data frame mapping each URL
    ## to the file it was found in.
    ## Some people get leading LFs in URLs, so trim before checking.
    db <- data.frame(URL = trimws(as.character(urls)),
                     Parent = as.character(parents),
                     stringsAsFactors = FALSE)
    structure(db, class = c("url_db", "data.frame"))
}

url_db_from_HTML_files <-
function(dir, recursive = FALSE, files = NULL, verbose = FALSE)
{
    ## Build a url_db from the hyperlinks in the given HTML files
    ## (by default, all *.html files under dir).
    if(is.null(files))
        files <- list.files(dir, pattern = "[.]html$",
                            full.names = TRUE,
                            recursive = recursive)
    extract <- function(f) {
        if(verbose)
            message(sprintf("processing %s",
                            .file_path_relative_to_dir(f, dir)))
        .get_urls_from_HTML_file(f)
    }
    found <- lapply(files, extract)
    names(found) <- files
    ## Drop files without any URLs.
    found <- Filter(length, found)
    urls <- parents <- character()
    if(length(found)) {
        parents <- rep.int(.file_path_relative_to_dir(names(found), dir),
                           lengths(found))
        urls <- unlist(found, use.names = FALSE)
    }
    url_db(urls, parents)
}

url_db_from_PDF_files <-
function(dir, recursive = FALSE, files = NULL, verbose = FALSE)
{
    ## Build a url_db from the hyperlinks in the given PDF files
    ## (by default, all *.pdf files under dir).
    if(is.null(files))
        files <- list.files(dir, pattern = "[.]pdf$",
                            full.names = TRUE,
                            recursive = recursive)
    ## FIXME: this is simpler to do with full.names = FALSE and without
    ## tools:::.file_path_relative_to_dir().
    extract <- function(f) {
        if(verbose)
            message(sprintf("processing %s",
                            .file_path_relative_to_dir(f, dir)))
        .get_urls_from_PDF_file(f)
    }
    found <- lapply(files, extract)
    names(found) <- files
    ## Drop files without any URLs.
    found <- Filter(length, found)
    urls <- parents <- character()
    if(length(found)) {
        parents <- rep.int(.file_path_relative_to_dir(names(found), dir),
                           lengths(found))
        urls <- unlist(found, use.names = FALSE)
    }
    url_db(urls, parents)
}

url_db_from_package_Rd_db <-
function(db)
{
    ## Build a url_db from a package Rd db, with parents recorded as
    ## man/<file>.
    found <- Filter(length, lapply(db, .get_urls_from_Rd))
    parents <- rep.int(file.path("man", names(found)), lengths(found))
    url_db(unlist(found, use.names = FALSE), parents)
}

url_db_from_package_metadata <-
function(meta)
{
    ## Extract URLs from the URL, BugReports and Description fields of
    ## package DESCRIPTION metadata.
    urls <- character()
    fields <- c("URL", "BugReports")
    for(v in meta[fields]) {
        if(is.na(v)) next
        ## First, URLs enclosed in <...>, possibly prefixed "URL: ".
        pattern <-
            "<(URL: *)?((https?|ftp)://[^[:space:],]*)[[:space:]]*>"
        m <- gregexpr(pattern, v)
        urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
        ## Blank out the matches so they are not picked up again by the
        ## bare-URL pattern below.
        regmatches(v, m) <- ""
        ## Then, bare URLs not preceded by '>' or '"'.
        pattern <- "(^|[^>\"])((https?|ftp)://[^[:space:],]*)"
        m <- gregexpr(pattern, v)
        urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
    }
    if(!is.na(v <- meta["Description"])) {
        ## Same two-step extraction for the Description text, but with
        ## slightly different patterns: commas are allowed inside <...>
        ## URLs, and bare URLs are restricted to a safe character set.
        pattern <-
            "<(URL: *)?((https?|ftp)://[^[:space:]]+)[[:space:]]*>"
        m <- gregexpr(pattern, v)
        urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
        regmatches(v, m) <- ""
        pattern <-
            "([^>\"])((https?|ftp)://[[:alnum:]/.:@+\\_~%#?=&;,-]+[[:alnum:]/])"
        m <- gregexpr(pattern, v)
        urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L))
    }

    ## All of these come from the DESCRIPTION file.
    url_db(urls, rep.int("DESCRIPTION", length(urls)))
}

url_db_from_package_citation <-
function(dir, meta, installed = FALSE)
{
    ## Build a url_db from the URLs recorded in the package CITATION
    ## file (inst/CITATION for source packages), if there is one.
    path <- if(installed) "CITATION" else file.path("inst", "CITATION")
    cfile <- file.path(dir, path)
    urls <- character()
    if(file.exists(cfile)) {
        cinfo <- .read_citation_quietly(cfile, meta)
        ## Silently skip CITATION files which cannot be read.
        if(!inherits(cinfo, "error"))
            urls <- trimws(unique(unlist(cinfo$url, use.names = FALSE)))
    }
    url_db(urls, rep.int(path, length(urls)))
}

url_db_from_package_news <-
function(dir, installed = FALSE)
{
    ## Build a url_db from inst/NEWS.Rd (NEWS.Rd for an installed
    ## package), if there is one.
    path <- if(installed) "NEWS.Rd" else file.path("inst", "NEWS.Rd")
    nfile <- file.path(dir, path)
    urls <- character()
    if(file.exists(nfile)) {
        macros <- initialRdMacros()
        rd <- prepare_Rd(parse_Rd(nfile, macros = macros),
                         stages = "install")
        urls <- .get_urls_from_Rd(rd)
    }
    url_db(urls, rep.int(path, length(urls)))
}

url_db_from_package_HTML_files <-
function(dir, installed = FALSE)
{
    ## Build a url_db from the package's HTML vignettes/docs, plus a
    ## top-level README.html for installed packages.
    path <- if(installed) "doc" else file.path("inst", "doc")
    files <- Sys.glob(file.path(dir, path, "*.html"))
    rfile <- file.path(dir, "README.html")
    if(installed && file.exists(rfile))
        files <- c(files, rfile)
    url_db_from_HTML_files(dir, files = files)
}

url_db_from_package_README_md <-
function(dir, installed = FALSE)
{
    ## Build a url_db from README.md (preferring inst/README.md for
    ## source packages), converting to HTML via pandoc first.
    urls <- path <- character()
    candidates <- c(if(!installed)
                        file.path(dir, "inst", "README.md"),
                    file.path(dir, "README.md"))
    rfile <- Filter(file.exists, candidates)[1L]
    ## Nothing to do without a README.md or without pandoc.
    if(!is.na(rfile) && nzchar(Sys.which("pandoc"))) {
        path <- .file_path_relative_to_dir(rfile, dir)
        tfile <- tempfile("README", fileext = ".html")
        on.exit(unlink(tfile))
        out <- .pandoc_md_for_CRAN(rfile, tfile)
        if(!out$status)
            urls <- .get_urls_from_HTML_file(tfile)
    }
    url_db(urls, rep.int(path, length(urls)))
}

url_db_from_package_NEWS_md <-
function(dir, installed = FALSE)
{
    ## Build a url_db from NEWS.md (preferring inst/NEWS.md for source
    ## packages), converting to HTML via pandoc first.
    urls <- path <- character()
    candidates <- c(if(!installed)
                        file.path(dir, "inst", "NEWS.md"),
                    file.path(dir, "NEWS.md"))
    nfile <- Filter(file.exists, candidates)[1L]
    ## Nothing to do without a NEWS.md or without pandoc.
    if(!is.na(nfile) && nzchar(Sys.which("pandoc"))) {
        path <- .file_path_relative_to_dir(nfile, dir)
        tfile <- tempfile("NEWS", fileext = ".html")
        on.exit(unlink(tfile))
        out <- .pandoc_md_for_CRAN(nfile, tfile)
        if(!out$status)
            urls <- .get_urls_from_HTML_file(tfile)
    }
    url_db(urls, rep.int(path, length(urls)))
}

url_db_from_package_sources <-
function(dir, add = FALSE) {
    ## Combine the url_dbs from all places a source package can
    ## contain URLs.  With add = TRUE, prefix parents with the
    ## package directory name.
    meta <- .read_description(file.path(dir, "DESCRIPTION"))
    parts <- list(url_db_from_package_metadata(meta),
                  url_db_from_package_Rd_db(Rd_db(dir = dir)),
                  url_db_from_package_citation(dir, meta),
                  url_db_from_package_news(dir))
    ## The HTML-based extractors need xml2.
    if(requireNamespace("xml2", quietly = TRUE))
        parts <- c(parts,
                   list(url_db_from_package_HTML_files(dir),
                        url_db_from_package_README_md(dir),
                        url_db_from_package_NEWS_md(dir)))
    db <- do.call(rbind, parts)
    if(add)
        db$Parent <- file.path(basename(dir), db$Parent)
    db
}

url_db_from_installed_packages <-
function(packages, lib.loc = NULL, verbose = FALSE)
{
    ## Combine the url_dbs of the given installed packages, with
    ## parents prefixed by the package name.
    if(!length(packages)) return()
    one <- function(p) {
        if(verbose)
            message(sprintf("processing %s", p))
        dir <- system.file(package = p, lib.loc = lib.loc)
        ## Skip packages which cannot be found.
        if(dir == "") return()
        meta <- .read_description(file.path(dir, "DESCRIPTION"))
        rddb <- Rd_db(p, lib.loc = dirname(dir))
        parts <- list(url_db_from_package_metadata(meta),
                      url_db_from_package_Rd_db(rddb),
                      url_db_from_package_citation(dir, meta,
                                                   installed = TRUE),
                      url_db_from_package_news(dir, installed = TRUE))
        ## The HTML-based extractors need xml2.
        if(requireNamespace("xml2", quietly = TRUE))
            parts <- c(parts,
                       list(url_db_from_package_HTML_files(dir,
                                                           installed = TRUE),
                            url_db_from_package_README_md(dir,
                                                          installed = TRUE),
                            url_db_from_package_NEWS_md(dir,
                                                        installed = TRUE)))
        db <- do.call(rbind, parts)
        db$Parent <- file.path(p, db$Parent)
        db
    }
    do.call(rbind,
            c(lapply(packages, one),
              list(make.row.names = FALSE)))
}

get_IANA_HTTP_status_code_db <-
function()
{
    ## Download the IANA HTTP status code registry as a data frame; see
    ## <https://www.iana.org/assignments/http-status-codes/http-status-codes.xhtml>
    baseurl <- "https://www.iana.org/assignments/http-status-codes/"
    con <- url(paste0(baseurl, "http-status-codes-1.csv"))
    db <- utils::read.csv(con, stringsAsFactors = FALSE)
    ## Drop "Unassigned".
    db[db$Description != "Unassigned", ]
}

## See <https://en.wikipedia.org/wiki/List_of_FTP_server_return_codes>
## and <http://tools.ietf.org/html/rfc959>,
## Section 4.2.2 "Numeric Order List of Reply Codes",
## and <https://tools.ietf.org/html/rfc2228>,
## Section 5 "New FTP Replies".
## Only need those >= 400.
## Map FTP reply code (as a character string) to its explanatory
## message; the 63x entries are the RFC 2228 protected replies.
table_of_FTP_server_return_codes <-
    c("421" = "Service not available, closing control connection.",
      "425" = "Can't open data connection.",
      "426" = "Connection closed; transfer aborted.",
      "430" = "Invalid username or password",
      "431" = "Need some unavailable resource to process security.",
      "434" = "Requested host unavailable.",
      "450" = "Requested file action not taken.",
      "451" = "Requested action aborted: local error in processing.",
      "452" = "Requested action not taken.  Insufficient storage space in system.",
      "500" = "Syntax error, command unrecognized.",
      "501" = "Syntax error in parameters or arguments.",
      "502" = "Command not implemented.",
      "503" = "Bad sequence of commands.",
      "504" = "Command not implemented for that parameter.",
      "530" = "Not logged in.",
      "532" = "Need account for storing files.",
      "533" = "Command protection level denied for policy reasons.",
      "534" = "Request denied for policy reasons.",
      "535" = "Failed security check (hash, sequence, etc).",
      "536" = "Requested PROT level not supported by mechanism.",
      "537" = "Command protection level not supported by security mechanism.",
      "550" = "Requested action not taken.  File unavailable",
      "551" = "Requested action aborted: page type unknown.",
      "552" = "Requested file action aborted.  Exceeded storage allocation (for current directory or dataset).",
      "553" = "Requested action not taken.  File name not allowed.",
      "631" = "Integrity protected reply.",
      "632" = "Confidentiality and integrity protected reply.",
      "633" = "Confidentiality protected reply."
      )

check_url_db <-
function(db, remote = TRUE, verbose = FALSE)
{
    ## Check the URLs in a url_db (or re-check the results in a
    ## check_url_db) and return a check_url_db data frame containing
    ## only the problematic entries.  With remote = FALSE only the
    ## "static" http/https checks (canonical CRAN/R-project form,
    ## embedded spaces) are performed; no ftp/http requests are made.

    ## Whether to retry failing HEAD requests with a full curl GET;
    ## controlled by _R_CHECK_URLS_USE_CURL_ and curl availability.
    use_curl <-
        config_val_to_logical(Sys.getenv("_R_CHECK_URLS_USE_CURL_",
                                         "TRUE")) &&
        requireNamespace("curl", quietly = TRUE)

    ## Assemble results into a check_url_db data frame; 'From' is a
    ## list column mapping each URL to its parent files.
    .gather <- function(u = character(),
                        p = list(),
                        s = rep.int("", length(u)),
                        m = rep.int("", length(u)),
                        new = rep.int("", length(u)),
                        cran = rep.int("", length(u)),
                        spaces = rep.int("", length(u)),
                        R = rep.int("", length(u))) {
        y <- data.frame(URL = u, From = I(p), Status = s, Message = m,
                        New = new, CRAN = cran, Spaces = spaces, R = R,
                        row.names = NULL, stringsAsFactors = FALSE)
        ## Re-assign to drop the AsIs wrapper from the list column.
        y$From <- p
        class(y) <- c("check_url_db", "data.frame")
        y
    }

    ## HEAD-request the URL, returning the curlGetHeaders() result or
    ## the error condition object on failure.
    .fetch <- function(u) {
        if(verbose) message(sprintf("processing %s", u))
        h <- tryCatch(curlGetHeaders(u), error = identity)
        if(inherits(h, "error")) {
            ## Currently, this info is only used in .check_http().
            ## Might be useful for checking ftps too, so simply leave it
            ## here instead of moving to .check_http().
            msg <- conditionMessage(h)
            ## libcurl errors 51/60 are certificate-verification
            ## failures: retry without verification and keep the
            ## result as an attribute.
            if (grepl("libcurl error code (51|60)", msg)) {
                h2 <- tryCatch(curlGetHeaders(u, verify = FALSE),
                               error = identity)
                attr(h, "no-verify") <- h2
            }
        }
        h
    }

    ## Check one ftp URL: returns c(status, message, "", "") with
    ## status "-1" on fetch error.
    .check_ftp <- function(u) {
        h <- .fetch(u)
        if(inherits(h, "error")) {
            s <- "-1"
            msg <- sub("[[:space:]]*$", "", conditionMessage(h))
        } else {
            s <- as.character(attr(h, "status"))
            msg <- table_of_FTP_server_return_codes[s]
        }
        c(s, msg, "", "")
    }

    ## Check one http/https URL: the remote part (.check_http_A) gives
    ## c(status, message, newLoc); the static part (.check_http_B)
    ## gives c(cran, spaces, R) -- six columns in total.
    .check_http <- if(remote)
        function(u) c(.check_http_A(u), .check_http_B(u))
    else
        function(u) c(rep.int("", 3L), .check_http_B(u))

    .check_http_A <- function(u) {
        h <- .fetch(u)
        newLoc <- ""
        if(inherits(h, "error")) {
            s <- "-1"
            msg <- sub("[[:space:]]*$", "", conditionMessage(h))
            if (!is.null(v <- attr(h, "no-verify"))) {
                s2 <- as.character(attr(v, "status"))
                msg <- paste0(msg, "\n\t(Status without verification: ",
                              table_of_HTTP_status_codes[s2], ")")
            }
        } else {
            s <- as.character(attr(h, "status"))
            msg <- table_of_HTTP_status_codes[s]
        }
        ## Look for redirected URLs
        if (any(grepl("301 Moved Permanently", h, useBytes = TRUE))) {
            ind <- grep("^[Ll]ocation: ", h, useBytes = TRUE)
            if (length(ind))
                newLoc <- sub("^[Ll]ocation: ([^\r]*)\r\n", "\\1", h[max(ind)])
        }
        ## Retry non-200 results with a full GET via curl, as some
        ## servers mishandle HEAD requests.
        if((s != "200") && use_curl) {
            g <- .curl_GET_status(u)
            if(g == "200") {
                s <- g
                msg <- "OK"
            }
        }
        ## A mis-configured site
        if (s == "503" && any(grepl("www.sciencedirect.com", c(u, newLoc))))
            s <- "405"
        c(s, msg, newLoc)
    }

    ## Static checks: flag non-canonical CRAN package/view URLs and
    ## CRAN mirror URLs, http:// R-project URLs, and embedded spaces.
    .check_http_B <- function(u) {
        ul <- tolower(u)
        cran <- ((grepl("^https?://cran.r-project.org/web/packages", ul) &&
                  !grepl("^https?://cran.r-project.org/web/packages/[.[:alnum:]]+(html|pdf|rds)$",
                         ul)) ||
                 (grepl("^https?://cran.r-project.org/web/views/[[:alnum:]]+[.]html$",
                        ul)) ||
                 startsWith(ul, "http://cran.r-project.org") ||
                 any(startsWith(ul, mirrors)))
        R <- grepl("^http://(www|bugs|journal).r-project.org", ul)
        spaces <- grepl(" ", u)
        c(if(cran) u else "", if(spaces) u else "", if(R) u else "")
    }

    bad <- .gather()

    if(!NROW(db)) return(bad)

    ## Could also use utils::getCRANmirrors(local.only = TRUE).
    mirrors <- c(utils::read.csv(file.path(R.home("doc"),
                                           "CRAN_mirrors.csv"),
                                 as.is = TRUE, encoding = "UTF-8")$URL,
                 "http://cran.rstudio.com/",
                 "https://cran.rstudio.com/")
    mirrors <- tolower(sub("/$", "", mirrors))

    if(inherits(db, "check_url_db")) {
        ## Allow re-checking check results.
        parents <- db$From
        urls <- db$URL
    } else {
        ## Check each distinct URL once, collecting all its parents.
        parents <- split(db$Parent, db$URL)
        urls <- names(parents)
    }

    parts <- parse_URI_reference(urls)

    ## Empty URLs.
    ind <- apply(parts == "", 1L, all)
    if(any(ind)) {
        len <- sum(ind)
        bad <- rbind(bad,
                     .gather(urls[ind],
                             parents[ind],
                             m = rep.int("Empty URL", len)))
    }

    ## Invalid URI schemes.
    schemes <- parts[, 1L]
    ind <- is.na(match(schemes,
                       c("",
                         IANA_URI_scheme_db$URI_Scheme,
                         ## Also allow 'javascript' scheme, see
                         ## <https://tools.ietf.org/html/draft-hoehrmann-javascript-scheme-03>
                         ## (but apparently never registered with IANA).
                         "javascript")))
    if(any(ind)) {
        len <- sum(ind)
        msg <- rep.int("Invalid URI scheme", len)
        doi <- schemes[ind] == "doi"
        if(any(doi))
            msg[doi] <- paste(msg[doi], "(use \\doi for DOIs in Rd markup)")
        bad <- rbind(bad,
                     .gather(urls[ind], parents[ind], m = msg))
    }

    ## ftp.
    pos <- which(schemes == "ftp")
    if(length(pos) && remote) {
        results <- do.call(rbind, lapply(urls[pos], .check_ftp))
        status <- as.numeric(results[, 1L])
        ## Report fetch errors (-1) and server codes >= 400.
        ind <- (status < 0L) | (status >= 400L)
        if(any(ind)) {
            pos <- pos[ind]
            s <- as.character(status[ind])
            s[s == "-1"] <- "Error"
            m <- results[ind, 2L]
            m[is.na(m)] <- ""
            bad <- rbind(bad,
                         .gather(urls[pos], parents[pos], s, m))
        }
    }

    ## http/https.
    pos <- which(schemes == "http" | schemes == "https")
    if(length(pos)) {
        results <- do.call(rbind, lapply(urls[pos], .check_http))
        status <- as.numeric(results[, 1L])
        ## 405 is HTTP not allowing HEAD requests
        ## maybe also skip 500, 503, 504 as likely to be temporary issues
        ## Also report entries flagged by the static checks (columns
        ## 4-6: CRAN form, spaces, R-project form).
        ind <- is.na(match(status, c(200L, 405L, NA))) |
            nzchar(results[, 4L]) |
            nzchar(results[, 5L]) |
            nzchar(results[, 6L])
        if(any(ind)) {
            pos <- pos[ind]
            s <- as.character(status[ind])
            s[is.na(s)] <- ""
            s[s == "-1"] <- "Error"
            m <- results[ind, 2L]
            m[is.na(m)] <- ""
            bad <- rbind(bad,
                         .gather(urls[pos], parents[pos], s, m,
                                 results[ind, 3L],
                                 results[ind, 4L],
                                 results[ind, 5L],
                                 results[ind, 6L]))
        }
    }
    bad
}

format.check_url_db <-
function(x, ...)
{
    ## Format each entry as a multi-line "URL: ... / From: ..." chunk,
    ## appending Status/Message and the static-check notes only where
    ## the corresponding fields are non-empty.
    if(!NROW(x)) return(character())

    urls <- x$URL
    moved <- nzchar(x$New)
    urls[moved] <- sprintf("%s (moved to %s)", urls[moved], x$New[moved])

    froms <- vapply(x$From, paste, "", collapse = "\n      ")
    status <- x$Status
    msg <- x$Message

    paste0(sprintf("URL: %s", urls),
           sprintf("\nFrom: %s", froms),
           ifelse(status == "",
                  "",
                  sprintf("\nStatus: %s", status)),
           ifelse(msg == "",
                  "",
                  sprintf("\nMessage: %s",
                          gsub("\n", "\n  ", msg, fixed = TRUE))),
           ifelse(x$Spaces == "",
                  "",
                  "\nURL contains spaces"),
           ifelse(x$CRAN == "",
                  "",
                  "\nCRAN URL not in canonical form"),
           ifelse(x$R == "",
                  "",
                  "\nR-project URL not in canonical form"))
}

print.check_url_db <-
function(x, ...)
{
    ## Write the formatted entries separated by blank lines; print
    ## nothing for an empty db.  Returns x invisibly.
    if(NROW(x) > 0L)
        writeLines(paste(format(x), collapse = "\n\n"))
    invisible(x)
}

as.matrix.check_url_db <-
function(x, ...)
{
    ## Flatten to a character matrix with one row per (URL, parent)
    ## pair: 'From' is a list column, so the scalar columns are
    ## replicated according to the number of parents of each URL.
    counts <- lengths(x[["From"]])
    cols <- c(list(URL = rep.int(x[["URL"]], counts),
                   Parent = unlist(x[["From"]])),
              lapply(x[-c(1L, 2L)], rep.int, counts))
    y <- do.call(cbind, cols)
    rownames(y) <- NULL
    y
}

.curl_GET_status <-
function(u, verbose = FALSE)
{
    ## Perform a full GET request for u and return the HTTP status
    ## code, or -1L if the request fails.
    if(verbose)
        message(sprintf("processing %s", u))
    ## Configure curl handle for better luck with JSTOR URLs/DOIs.
    ## Alternatively, special-case requests to
    ##   https?://doi.org/10.2307
    ##   https?://www.jstor.org
    handle <- curl::new_handle()
    curl::handle_setopt(handle,
                        cookiesession = 1,
                        followlocation = 1,
                        http_version = 2L,
                        ssl_enable_alpn = 0)
    res <- tryCatch(curl::curl_fetch_memory(u, handle = handle),
                    error = identity)
    if(inherits(res, "error"))
        -1L
    else
        res$status_code
}
